

# Load the lookup table produced by create_names_gender.r; run that script
# first if data/names_gender.rds does not exist yet.
# NOTE(review): load() restores objects written by save(), so despite the
# .rds extension this file is presumably in RData format and is expected to
# define an object called `names` -- confirm against create_names_gender.r.
load('data/names_gender.rds')
# Convert `names` to a data.table in place.
setDT(names)

normalize_zip = function(zipcode) {
  # Normalize ZIP codes to five-character strings.
  #
  # Args:
  #   zipcode: vector of ZIP codes (character, numeric or factor); may hold
  #     ZIP+4 values and values that lost their leading zeros.
  # Returns:
  #   A vector of the same length. ZIP+4 values are cut down to the base ZIP,
  #   short values are left-padded with '0', and missing inputs as well as the
  #   placeholder codes '00000' and '99999' become NA.
  zipcode = as.character(zipcode) %>% str_trim()
  nas = is.na(zipcode)

  # Hyphenated ZIP+4: keep the digits before the hyphen. The hyphen alone is
  # enough to trigger this, so a dangling '12345-' yields '12345' instead of
  # falling through to the truncation below (which turned it into '00001').
  zipcode = ifelse(str_detect(zipcode, '^\\d+\\-'),
                   str_extract(zipcode, '^\\d+'),
                   zipcode)

  # Anything still longer than five characters is treated as an unhyphenated
  # ZIP+4: drop the last four digits. The result is trimmed so that a
  # space-separated '12345 6789' does not keep a trailing blank.
  zipcode = ifelse(nchar(zipcode) > 5,
                   str_extract(zipcode, '(.*)(?=\\d{4})') %>% str_trim(),
                   zipcode)

  # Restore leading zeros that were lost (e.g. when the ZIP was read as a number).
  zipcode = ifelse(nchar(zipcode) < 5, str_pad(zipcode, 5, 'left', '0'), zipcode)
  zipcode[nas] = NA_character_

  # Placeholder codes carry no information. %in% never yields NA, so this
  # assignment keeps the vector character even when every input is missing.
  zipcode[zipcode %in% c('00000', '99999')] = NA_character_
  return(zipcode)
}

po_box_parser_fec = function(address_field_1, address_field_2) {
  # Adapted from: https://stackoverflow.com/questions/5680050/po-box-regular-expression-validation/25271289#25271289
  # Changes: changed p(ost) to p(ost(al)) and removed beginning caret
  # Accepts two address fields (use for FEC data)
  #
  # Args:
  #   address_field_1, address_field_2: character vectors of equal length
  #     holding the first and second address line of each record.
  # Returns:
  #   A character vector as long as address_field_1 with the PO box number
  #   found for each record, or NA when neither field looks like a PO box.
  #   When both fields contain a PO box, the number from the second one wins.
  
  # po_string decides whether a field is a PO box address; po_extrac captures
  # the "PO box" wording so the field can be split around it.
  po_string = "(?i)^ *(((p o box|p o drawer|pmb|box|bin)[-. \\/\\\\]?\\d+)|(postoffice)|(pro box? *\\d+)|(post office box)|(post office)|(po drawer)|(p(ost(al))? *(drawer)? *\\d+)|(.*p[ \\.]? ?(o|0|op)[-. \\/\\\\]? *-?((box|bin|nox)|b|(#|num)?\\d+))|(p(ost(al))? *(o(ff(ice)?)?)? *((box|bin)|b)? *\\d+)|(p *-?\\/?(o)? *-?box)|post office box|((box|bin)|b) *(number|num|#)? *\\d*\\d+)"
  po_extrac = "(?i) *(((p o box|p o drawer|pmb|box|bin)[-. \\/]?)|(postoffice)|(pro box? *)|(post office)|(post office box)|(po drawer)|(p(ost(al))? *(drawer))|(.*p[ \\.]? ?(o|0|op)[-. \\/]? *-?((box|bin|nox)|b|(#|num)?))|(p(ost(al))? *(o(ff(ice)?)?)? *((box|bin)|b)? *)|(p *-?\\/?(o)? *-?box)|post office box|((box|bin)|b) *(number|num|#)? *)"
  
  # Parse one address field: returns the PO-box mask and the number parsed
  # from every flagged element.
  parse_field = function(address_field) {
    is_po = str_detect(address_field, po_string) & !is.na(address_field)
    
    # Same guard as po_box_parser: with no hits the extraction below would
    # index the first column of an empty match matrix and fail.
    if (!any(is_po)) {
      return(list(is_po = is_po, po_nums = character(0)))
    }
    
    po_addresses = address_field[is_po]
    # The first po_extrac match of each address is reused as that address's
    # split pattern; the number is then read from the resulting pieces.
    po_prefixes = str_extract_all(po_addresses, pattern = po_extrac, simplify = TRUE)[, 1]
    po_nums = str_split(po_addresses, pattern = po_prefixes) %>%
      map_chr( ~ str_squish(.x) %>%
                 str_replace("\\D*(\\d+).*", "\\1") %>%
                 str_c(collapse = ''))
    
    list(is_po = is_po, po_nums = po_nums)
  }
  
  field_1 = parse_field(address_field_1)
  field_2 = parse_field(address_field_2)
  
  out = rep(NA_character_, length(address_field_1))
  if (any(field_1$is_po)) {
    out[field_1$is_po] = field_1$po_nums
  }
  if (any(field_2$is_po)) {
    out[field_2$is_po] = field_2$po_nums # will use second one if there's more than one
  }
  out = str_squish(out)
  out[out == ''] = NA
  
  return(out)
  
}

merge_fields = function(f1, f2) {
  # Join two text fields element-wise into one string separated by a single
  # space. Missing values count as empty strings, and surplus whitespace is
  # collapsed, so a record with only one field filled yields just that field.
  first_part = str_replace_na(f1, '')
  second_part = str_replace_na(f2, '')
  str_squish(str_c(first_part, ' ', second_part))
}

usps_address = function(address) {
  # Adapted from fastLink::preprocText
  #
  # Lower-cases an address, shortens common street and unit words to their
  # abbreviations, strips punctuation and collapses whitespace.
  #
  # The abbreviations are applied one after another as plain substring
  # replacements, so they also fire inside longer words (for example
  # 'broadway' becomes 'brdway').
  abbreviations = c(
    'boulevard' = 'blvd',
    'circle' = 'cir',
    'court' = 'ct',
    'drive' = 'dr',
    'junction' = 'jct',
    'place' = 'pl',
    'road' = 'rd',
    'route' = 'rte',
    'square' = 'sq',
    'street' = 'st',
    'apartment' = 'apt',
    'building' = 'bldg'
  )
  
  lowered = enc2utf8(str_to_lower(address))
  abbreviated = str_replace_all(lowered, abbreviations)
  without_punct = str_remove_all(abbreviated, '[[:punct:]]+')
  str_squish(without_punct)
}

po_box_parser = function(address_field_1) {
  # Adapted from: https://stackoverflow.com/questions/5680050/po-box-regular-expression-validation/25271289#25271289
  # Accepts only one field
  #
  # Args:
  #   address_field_1: character vector of address lines.
  # Returns:
  #   A character vector of the same length holding the PO box number of each
  #   address, or NA where the address does not look like a PO box. The result
  #   is character even when nothing matches, so the type does not depend on
  #   the data.
  
  # Lower-case and drop parentheses and plus signs before matching; the
  # wording extracted below is reused as a regular expression, where these
  # characters are metacharacters.
  address_field_1 = str_to_lower(address_field_1) %>% str_remove_all('\\(|\\+|\\)')
  
  # po_string decides whether a field is a PO box address; po_extrac captures
  # the "PO box" wording so the field can be split around it.
  po_string = "(?i)^ *(((p o box|p o drawer|pmb|box|bin)[-. \\/\\\\]?\\d+)|(postoffice)|(pro box? *\\d+)|(post office box)|(post office)|(po drawer)|(p(ost(al))? *(drawer)? *\\d+)|(.*p[ \\.]? ?(o|0|op)[-. \\/\\\\]? *-?((box|bin|nox)|b|(#|num)?\\d+))|(p(ost(al))? *(o(ff(ice)?)?)? *((box|bin)|b)? *\\d+)|(p *-?\\/?(o)? *-?box)|post office box|((box|bin)|b) *(number|num|#)? *\\d*\\d+)"
  po_extrac = "(?i) *(((p o box|p o drawer|pmb|box|bin)[-. \\/]?)|(postoffice)|(pro box? *)|(post office)|(post office box)|(po drawer)|(p(ost(al))? *(drawer))|(.*p[ \\.]? ?(o|0|op)[-. \\/]? *-?((box|bin|nox)|b|(#|num)?))|(p(ost(al))? *(o(ff(ice)?)?)? *((box|bin)|b)? *)|(p *-?\\/?(o)? *-?box)|post office box|((box|bin)|b) *(number|num|#)? *)"
  
  is_po_1 = str_detect(address_field_1, po_string) &
    !is.na(address_field_1)
  
  out = rep(NA_character_, length(address_field_1))
  
  # Nothing to parse: return early, because the extraction below indexes the
  # first column of the match matrix and that column is absent without hits.
  if (!any(is_po_1)) {
    return(out)
  }
  
  po_addresses = address_field_1[is_po_1]
  # The first po_extrac match of each address is reused as that address's
  # split pattern; the number is then read from the resulting pieces.
  po_prefixes = str_extract_all(po_addresses, pattern = po_extrac, simplify = TRUE)[, 1]
  out[is_po_1] = str_split(po_addresses, pattern = po_prefixes) %>%
    map_chr( ~ str_squish(.x) %>%
               str_replace("\\D*(\\d+).*", "\\1") %>%
               str_c(collapse = ''))
  
  out = str_squish(out)
  out[out == ''] = NA
  
  return(out)
  
}

fix_owner_string = function(str) {
  # Strip trust, estate, trustee, spouse and similar boilerplate from owner
  # name strings so that only the name itself is left.
  #
  # The patterns below are applied one after another, each removing only its
  # first match in the current string. Their order matters: later patterns
  # see the output of earlier ones. Whitespace is squished at the end.
  removal_patterns = c(
    '[[:digit:]]+',
    '( REV(OC(ABLE)?)? )?( LIV(ING)? )? (TR(UST(EE)?(S)?)?(RST)?)( )?$',
    '( )?(THE )',
    '( CO-)?( TRUSTEE(S)?).*$',
    '^(TRUSTEE(S)?) (OF )?',
    '/(TRUSTEE)(S)?$',
    '(/(CO-)?TR)$',
    '(LIVING TRUST).*$',
    '(^FAMILY)|( FAMILY )|( FAMILY$)',
    '\\([^\\)]+\\)',
    '^C\\/O',
    '[/]?(SPOUSE$|WIFE$|( WF$))',
    '[/]?[&]?(HUSBAND)',
    '( LIFE)?( ESTATE)$',
    '( REV(OC(ABLE)?)?)( LIV(ING)?)?$',
    ' REV TRUST$| IRREVOCABLE$| REV TRUST$',
    ' REVOCABLE ',
    ' ET( )?AL$',
    '&$',
    '/(CO-)?(TR)$',
    '( et al)',
    '( LIVING)? TRUST( )?$',
    '( JOINT)? TRUST(S)? ',
    ' LIVING$',
    'CUSTODIAN',
    ' ESTATE'
  )
  
  # Fold the patterns over the input in the order listed above.
  stripped = Reduce(
    function(current, pattern) str_remove(current, pattern),
    removal_patterns,
    str
  )
  str_squish(stripped)
}

parse_trust_name = function(name_field) {
  # Split the names of single-person trusts into a surname and the remaining
  # name parts.
  #
  # Args:
  #   name_field: character vector of buyer names containing trust wording.
  # Returns:
  #   A list of two character vectors, each as long as name_field: the token
  #   picked as surname, and all other tokens joined by single spaces.
  #
  # Digits are removed first, then the trust wording below is stripped in the
  # listed order (each pattern sees the result of the previous one).
  trust_patterns = c(
    '(?i) lv tr(u(st)?)?$',
    '(?i)rev(ocable)? liv(ing)?',
    '(?i)rev(ocable)? tr(ust)?',
    '(?i)living tr$',
    '(?i)( fam(ily))?( liv(ing))?( land)? tr(ust)?$',
    '(?i)living tr(ust)?$',
    '(?i)family tr(ust)?$',
    '(?i) trust$',
    '(?i) family$',
    '(?i) famil$',
    '(?i) liv$',
    '(?i) trus$',
    '(?i) tr$',
    '(?i) fam(ily)?$',
    '(?i) liv(ing)?$',
    '(?i) joint',
    '(?i)( REV$)|( IRR$)| (SEPARATE$)|( FAMILY$)',
    '(?i) TRUST$',
    '(?i) TRUST$',
    '(?i) TRUST ',
    '(?i) FAMILY$'
  )
  
  without_digits = str_remove_all(name_field, '[:digit:]')
  without_trust = Reduce(
    function(current, pattern) str_replace_all(current, pattern, ''),
    trust_patterns,
    without_digits
  )
  
  # Hyphens become spaces, so each half of a double-barrelled name is its own token.
  cleaned = str_replace_all(str_trim(without_trust), fixed('-'), ' ')
  
  # prep.name, get.census.data and determine.surname are defined outside this
  # file. NOTE(review): the first element of determine.surname's result is
  # used as the position of the surname token -- confirm against its definition.
  prepared = map_chr(cleaned, prep.name)
  name_parts = map(prepared, ~ strsplit(.x, ' ')[[1]])
  
  census_data = lapply(name_parts, get.census.data)
  surname_guess = lapply(census_data, determine.surname)
  surname_parts = unlist(lapply(surname_guess, '[[', 1))
  
  last_names = map2_chr(name_parts, surname_parts, function(parts, idx) parts[idx])
  other_names = map2_chr(name_parts, surname_parts, function(parts, idx) {
    str_c(parts[-idx], collapse = ' ')
  })
  
  return(list(last_names, other_names))
}

parse_trust_couple = function(name_field) {
  # Split the names of trusts held by a couple ("LAST FIRST1 & FIRST2 TRUST")
  # into last/first name fields for both people.
  #
  # Args:
  #   name_field: character vector of buyer names containing trust wording.
  # Returns:
  #   A list of four character vectors, each as long as name_field:
  #   last name 1, first name 1, last name 2 (same as last name 1) and first
  #   name 2. When no second first name is found, the whole cleaned name is
  #   returned in both last-name slots and both first-name slots are NA.
  #   Every element is a character vector, also for empty input or when no
  #   name has a second person.
  
  # Remove digits and trust wording; each pattern sees the previous result.
  name_parts = name_field %>% str_remove_all('[:digit:]') %>%
    str_replace_all('(?i) lv tr(u(st)?)?$', '') %>%
    str_replace_all('(?i)rev(ocable)? liv(ing)?', '') %>%
    str_replace_all('(?i)rev(ocable)? tr(ust)?', '') %>%
    str_replace_all('(?i)living tr$', '') %>%
    str_replace_all('(?i)( fam(ily))?( liv(ing))?( land)? tr(ust)?$', '') %>%
    str_replace_all('(?i)living tr(ust)?$', '') %>%
    str_replace_all('(?i)family tr(ust)?$', '') %>%
    str_replace_all('(?i) trust$', '') %>%
    str_replace_all('(?i) family$', '') %>%
    str_replace_all('(?i) famil$', '') %>%
    str_replace_all('(?i) liv$', '') %>%
    str_replace_all('(?i) trus$', '') %>%
    str_replace_all('(?i) tr$', '') %>%
    str_replace_all('(?i) fam(ily)?$', '') %>%
    str_replace_all('(?i) liv(ing)?$', '') %>%
    str_replace_all('(?i) joint', '') %>%
    str_replace_all('(?i)( REV$)|( IRR$)| (SEPARATE$)|( FAMILY$)', '') %>%
    str_replace_all('(?i) TRUST$', '') %>%
    str_replace_all('(?i) TRUST$', '') %>%
    str_replace_all('(?i) TRUST ', '') %>%
    str_trim() %>%
    str_replace_all(fixed('-'), ' ')
  
  # The first token is taken as the shared surname.
  surnames = str_remove(name_parts, ' .*$')
  
  # Everything after the surname holds the first names, separated by
  # "and" / "&". Matching is case-insensitive so that it agrees with the
  # case-insensitive couple detection in parse_trust_20.
  forenames = name_parts %>%
    substring(nchar(surnames) + 1) %>%
    str_trim() %>% str_split('(?i)( and | & )')
  
  # vapply keeps these character vectors even for empty input, where sapply
  # would hand back an empty list.
  forenames1 = vapply(forenames, function(x) x[1], character(1), USE.NAMES = FALSE)
  forenames2 = vapply(forenames, function(x) x[2], character(1), USE.NAMES = FALSE)
  
  no_second = is.na(forenames2)
  
  # ifelse() returns a logical vector when the test is empty or only the NA
  # branch is taken, hence the explicit as.character().
  out_parts = list(
    as.character(ifelse(no_second, name_parts, surnames)),
    # last one
    as.character(ifelse(no_second, NA_character_, forenames1)),
    # first one
    as.character(ifelse(no_second, name_parts, surnames)),
    # last two (same)
    forenames2 # first two
  )
  
  return(out_parts)
  
}

parse_trust_20 = function(dt) {
  # parse trusts (and couples) for 2020
  #
  # Args:
  #   dt: data.table with the columns buyer1_full, buyer2_full, buyer1_corp
  #     and buyer2_corp. The buyer*_last / buyer*_fm columns are assigned with
  #     `:=`, i.e. dt itself is updated by reference.
  # Returns:
  #   The four columns buyer1_last, buyer1_fm, buyer2_last, buyer2_fm of dt
  #   (last, first, last, first), where NA entries are to be ignored.
  
  trust_regex = '(?i)( TR(UST)?$)|( TRUST )|( FAMIL(Y)?$)|( TRUS$)|( LIV$)|( REV TR)'
  buyer1_cols = c('buyer1_last', 'buyer1_fm')
  buyer2_cols = c('buyer2_last', 'buyer2_fm')
  
  # A buyer only counts as a trust (or couple) when its corp field is empty.
  buyer1_not_corp = dt$buyer1_corp == ''
  buyer2_not_corp = dt$buyer2_corp == ''
  
  is_trust_1 = str_detect(dt$buyer1_full, trust_regex) & buyer1_not_corp
  is_trust_2 = str_detect(dt$buyer2_full, trust_regex) & buyer2_not_corp
  is_couple = str_detect(dt$buyer1_full, '(?i)( & )|( AND )') & buyer1_not_corp
  
  single_trust_1 = is_trust_1 & !is_couple
  couple_trust_1 = is_trust_1 & is_couple
  
  # buyer 1 is a trust naming one person: fill buyer 1's fields only
  single_names = parse_trust_name(dt[(single_trust_1)]$buyer1_full)
  dt[(single_trust_1), (buyer1_cols) := single_names]
  
  # buyer 2 is a trust: fill buyer 2's fields
  second_names = parse_trust_name(dt[(is_trust_2)]$buyer2_full)
  dt[(is_trust_2), (buyer2_cols) := second_names]
  
  # buyer 1 is a trust naming a couple: fill the fields of both buyers
  couple_names = parse_trust_couple(dt[(couple_trust_1)]$buyer1_full)
  dt[(couple_trust_1), (c(buyer1_cols, buyer2_cols)) := couple_names]
  
  return(dt[, c('buyer1_last', 'buyer1_fm', 'buyer2_last', 'buyer2_fm')])
}

fix_last_names = function(st) {
  # Tidy last-name strings:
  #   * join a leading 'MC ', 'O ' or 'ST ' to the rest of the name
  #     ('MC DONALD' -> 'MCDONALD'),
  #   * remove the first agent phrase ('AGENT FOR', 'AGT OF', trailing
  #     ' AGENT', ...),
  #   * remove the first heir phrase (' HEIRS OF ', trailing ' HEIR', ...).
  # Surrounding whitespace is left as it is.
  prefix_joins = c(
    '^MC ' = 'MC',
    '^O ' = 'O',
    '^ST ' = 'ST'
  )
  agent_regex = 'AGENT FOR( THE)?|AGENT OF( THE)?|AGT OF( THE)?|AGT FOR( THE)?| AGT$| AGENT$'
  heir_regex = ' HEIR(S)? (OF )?| HEIR(S?)$'
  
  joined = str_replace_all(st, prefix_joins)
  without_agent = str_remove(joined, agent_regex)
  str_remove(without_agent, heir_regex)
}
